#!/usr/bin/env python
# coding: utf-8

# In[10]:


from nltk.corpus import gutenberg

# Report template: file id, character count, token count, sentence count.
# NOTE(review): the figure labelled 词汇量 is len(words), i.e. the total
# number of tokens, not the number of distinct words -- confirm intended.
GUTENBERG_REPORT = "“%s”的文本长度为%d,词汇量为%d,句子数量为%d"

# Print one summary line for every file in the Gutenberg corpus.
for fileid in gutenberg.fileids():
    # Fetch the three views of the same file first ...
    raw = gutenberg.raw(fileid)
    words = gutenberg.words(fileid)
    sents = gutenberg.sents(fileid)
    # ... then take their lengths in one step.
    num_length, num_words, num_sents = len(raw), len(words), len(sents)
    print(GUTENBERG_REPORT % (fileid, num_length, num_words, num_sents))


# In[9]:


from nltk.corpus import brown

# Each entry pairs a heading with a deferred corpus query.  The queries are
# wrapped in lambdas so that every corpus call runs only after its heading
# has been printed, exactly as in a straight-line sequence of prints.
brown_sections = (
    ('布朗类别:', lambda: brown.categories()),
    ("布朗语料库news类别文件;", lambda: brown.fileids(categories='news')),
    ('布朗语料库news词汇', lambda: brown.words(categories='news')),
    ('布朗语料库news句子', lambda: brown.sents(categories='news')),
)

# Print the heading, then the value its query returns.
for heading, query in brown_sections:
    print(heading)
    print(query())


# In[12]:


from nltk.corpus import reuters

# First five file ids of the Reuters corpus, preceded by a blank line.
print()
print(reuters.fileids()[:5])
# Two blank lines separate the file ids from the category listing.
print('')
print()
# Every category name known to the corpus.
print(reuters.categories())
# Categories attached to one specific document.
print('文件类别所属类别：%s'%(reuters.categories('test/14828')))
# File ids belonging to either of the two categories.  The category is
# spelled 'corn'; the earlier spelling 'cron' is not a category name in this
# corpus, so the query did not cover the files it was meant to list.
print('类别crude和corn对应的文件')
print(reuters.fileids(['crude','corn']))


# In[ ]:


from urllib.request import urlopen
from zhconv import convert

# Download a plain-text book from Project Gutenberg and print a short
# excerpt converted with zhconv's 'zh-hans' target.
url='https://www.gutenberg.org/files/23962/23962-0.txt'

# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, even if read() or decode() raises.
with urlopen(url) as response:
    html = response.read().decode('utf-8')

# Only characters 600..999 of the decoded text are converted and shown.
html = convert(html[600:1000], 'zh-hans')
print(html)


# In[ ]:


import re

# Demo: split a text on the full stop '。', report whether each fragment
# starts with 电脑, and replace every 电脑 in it with 计算机.
# NOTE(review): "发ming" in the sample text and "bu" in the message below
# look like input-method slips (for 发明 / 不); they are kept verbatim here
# because they are part of the program's output -- confirm before changing.
text = "电脑是20世纪人类最伟大的发ming"
p_string = text.split('。')

# Compile the keyword once instead of passing the pattern on every call.
keyword = re.compile('电脑')

for line in p_string:
    if not line:
        # An empty fragment can neither start with nor contain the keyword,
        # so nothing is printed for it.
        continue

    # match() only succeeds when the keyword sits at the very beginning.
    verdict = "是以电脑开头的" if keyword.match(line) else "bu是以电脑开头的"
    print('句子', line, verdict)

    # subn() returns the rewritten text plus the number of substitutions;
    # a non-zero count means the keyword occurs somewhere in the fragment.
    replaced, hits = keyword.subn('计算机', line)
    if hits:
        line = replaced
        print('将电脑替换为计算机的后果:', line)

